library(atus)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.1.2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(scales)
library(ggplot2)
library(RColorBrewer)

Race and Income

##Background Visualizations

Racial Composition of U.S.

The following bar graph represents the numbers of people of each race in the U.S. population. This provides a frame of reference for the U.S. population composition when viewing later data visualizations.

National Visualization

race_plot <- ggplot(data = atuscps, aes(x = race, fill = race)) +
  geom_bar() +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Racial Composition of the U.S.", x = NULL, y = NULL, fill = "Race") +
  labs(title = "Regional Racial Compositions in the U.S.", x = "Race", y = "Count") +
  theme(
    plot.title = element_text(hjust = 0.5)
    ) 
race_plot

Regional Visualization The regional racial distributions seem to be similar to the national one. From this visualization, it can be concluded that the most populated region of the U.S. is the South and the least is the Northeast.

race_plot2 <- ggplot(data = atuscps, aes(x = race, fill = race)) +
  geom_bar() +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Racial Composition of the U.S.", x = NULL, y = NULL, fill = "Race") +
  labs(title = "Regional Racial Compositions in the U.S.", x = "Race", y = "Count") +
  theme(
    plot.title = element_text(hjust = 0.5)
    ) +
  facet_wrap(~ region)
race_plot2

Subcategories of Race

Other demographic variables, such as citizenship status and country of origin, are relevant to race and may act as subcategories of race.

Citizenship Status Across Race

National Visualization Surprisingly, there seems to be a greater percentage of black people who hold citizenship than that of white people. This may be due to greater immigration from European countries. It is clear that Asians are most likely to not hold citizenship status among the racial categories studided.

citizenship_plot <- ggplot(data = atuscps, mapping = aes(x = race, fill = citizen)) +
  geom_bar(position = "fill") +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "U.S. Citizenship Status Across Races", x = "Race", y = "Count", fill = "U.S. Citizen") +
  theme(
    plot.title = element_text(hjust = 0.5)
  )
citizenship_plot

Regional Visualization Aside from Asians, every other racial group consistently has at least 87.5% citizenship in each region. In the west, a greater proportion of Asians are citizens compared to those of the three other regions.

citizenship_plot2 <- ggplot(data = atuscps, mapping = aes(x = race, fill = citizen)) +
  geom_bar(position = "fill") +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "U.S. Citizenship Status Across Races by Region", x = "Race", y = "Count", fill = "U.S. Citizen") +
  theme(
    plot.title = element_text(hjust = 0.5)
  ) +
  facet_wrap(~ region)
citizenship_plot2

Country of Origin

In the atuscps dataset, the variable listing each respondent’s country of origin is binary; repsondents were asked whether or not their country of origin is the U.S. Both the national and regional visualizations for this variable seem to correspond strongly with the U.S. citizenship status visualizations in the previous section.

National Visualization

country_plot <- ggplot(data = atuscps, mapping = aes(x = race, fill = country_born)) +
  geom_bar(position = "fill") +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Countries of Origin Across Races", x = "Race", y = "Count", fill = "Country of Origin") +
  theme(
    plot.title = element_text(hjust = 0.5)
  )
country_plot

Regional Visualization

country_plot2 <- ggplot(data = atuscps, mapping = aes(x = race, fill = country_born)) +
  geom_bar(position = "fill") +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Countries of Origin Across Races by Region", x = "Race", y = "Count", fill = "Country of Origin") +
  theme(
    plot.title = element_text(hjust = 0.5)
  ) +
  facet_wrap(~ region)
country_plot2

Race vs. Income

To visualize the possible relationship between race and income, three representations of income were used : family income brackets, average family income, and the ratio of the top 1% to the bottom 99%.

Observations with missing values have been removed from this dataset to better visualize the income graphs

income <- atuscps %>% 
  na.omit()
income

Family Income Brackets

Race As expected, whites and Asians are increasingly larger proportions of income brackets as they ascend. Conversely, blacks and other racial groups are decreasingly smaller proportions of income brackets as they ascend. This income disparity could be attributed to economic and social inequality.

fam_income <- ggplot(data = income, mapping = aes(x = famincome, fill = race)) +
  geom_bar(position = "fill") +
  scale_fill_brewer(palette = "Set1") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Racial Breakdown of Each Income Bracket in the U.S.", x = "Family Income Bracket", y = "Count", fill = "Race") +
  theme(
    plot.title = element_text(hjust = 0.5)
  )
fam_income

Regional Differences in Income To determine whether the regional income distributions differed from those of the national visualization, the facet_wrap() function was used. Again, the regional trends are similar to those of the national one.

fam_income2 <- ggplot(data = income, mapping = aes(x = famincome, fill = race)) +
  geom_bar(position = "fill") +
  scale_fill_brewer(palette = "Set1") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Racial Breakdown of Each Income Bracket in the U.S. by Region", x = "Family Income Bracket", y = "Count", fill = "Race") +
  theme(
    plot.title = element_text(hjust = 0.5)
  ) +
  facet_wrap(~ region)
fam_income2

U.S. Citizenship While the proportion of non-citizen blacks in each income bracket remains fairly consistent, the proportion of blacks with U.S. citizenship decreases as the income brackets increase. For whites, the proportion of people decreases for non-citizens and increases for citizens as the income brackets increase. The other racial groups seem to maintain the same distributions regardless of citizenship status. Although this requires further statistical analysis, these results may indicate that citizenship is a significant variable in predicting income.

fam_income3 <- ggplot(data = income, mapping = aes(x = famincome, fill = race)) +
  geom_bar(position = "fill") +
  scale_fill_brewer(palette = "Set1") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Racial Breakdown of Each Income Bracket in the U.S. by Citizenship", x = "Family Income Bracket", y = "Count", fill = "Race") +
  theme(
    plot.title = element_text(hjust = 0.5)
  ) +
  facet_wrap(~ citizen)
fam_income3

Country of Origin The proportion of whites born outside of the U.S. decreases as the income bracket increases, which corresponds to the trend seen for non-citizen whites in the citizenship visualization. The proportion of blacks born outside of the U.S. remains fairly consistent across each income bracket, which again reflects the citizenship distribution. Distributions for the two other racial groups are similar to those of the national ones.

fam_income4 <- ggplot(data = income, mapping = aes(x = famincome, fill = race)) +
  geom_bar(position = "fill") +
  scale_fill_brewer(palette = "Set1") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Racial Breakdown of Each Income Bracket in the U.S. by Country of Origin", x = "Family Income Bracket", y = "Count", fill = "Race") +
  theme(
    plot.title = element_text(hjust = 0.5)
  ) +
  facet_wrap(~ country_born)
fam_income4

Average Family Income

The dataset below calculates the average family income for each race within each region.

avg_income_by_race <- atuscps 
avg_income_by_race <-avg_income_by_race %>%
  separate(famincome, into=c('income_low','income_high'), sep='-',convert=TRUE)%>%
  mutate(income_low = as.integer(income_low)) %>%
  drop_na() %>%
  mutate(fam_income_mid = (income_high+income_low)/2) %>%
  group_by(race) %>%
  summarise(N = n(),
            avg_income = mean(fam_income_mid)) %>%
  arrange(avg_income)
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 15474 rows [205,
## 211, 248, 292, 311, 377, 400, 431, 504, 535, 554, 555, 581, 596, 611, 616, 655,
## 662, 710, 766, ...].
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
avg_income_by_race

Race Surprisingly, Asians have the highest average family income, followed by whites, other, then blacks.

avg_income_plot <- ggplot(data = avg_income_by_race, mapping = aes(x = race, y = avg_income)) +
  geom_col() +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Average U.S. Family Income by Race", x = "Race", y = "Average Family Income") +
  theme(
    plot.title = element_text(hjust = 0.5)
  ) 
avg_income_plot

Regional Differences in Income The average family income maintains the same distribution for each region as the general racial one in the previous visualization.

avg_income_by_race2 <- atuscps %>%
  separate(famincome, into=c('income_low','income_high'), sep='-',convert=TRUE)%>%
  mutate(income_low = as.integer(income_low)) %>%
  drop_na() %>%
  mutate(fam_income_mid = (income_high+income_low)/2) %>%
  group_by(race, region) %>%
  summarise(N = n(),
            avg_income = mean(fam_income_mid)) %>%
  arrange(avg_income)
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 15474 rows [205,
## 211, 248, 292, 311, 377, 400, 431, 504, 535, 554, 555, 581, 596, 611, 616, 655,
## 662, 710, 766, ...].
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## `summarise()` has grouped output by 'race'. You can override using the `.groups` argument.
avg_income_by_race2
avg_income_plot2 <- ggplot(data = avg_income_by_race2, mapping = aes(x = race, y = avg_income)) +
  geom_col() +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Regional Average U.S. Family Income by Race", x = "Race", y = "Average Family Income") +
  theme(
    plot.title = element_text(hjust = 0.5)
  ) +
  facet_wrap(~ region) +
  geom_text(aes(label = signif(avg_income, digits = 3)), angle = 90, 
              nudge_y = -15000, color = 'white')
avg_income_plot2

U.S. Citizenship For U.S. citizens, the distribution was the same as the ones for the previous two visualizations. Surprisingly, average family income was about $40,000 for all non-citizens except for Asians. Asian non-citizens had a considerably higher average family income than these other racial groups.

avg_income_by_race3 <- atuscps %>%
  separate(famincome, into=c('income_low','income_high'), sep='-',convert=TRUE)%>%
  mutate(income_low = as.integer(income_low)) %>%
  drop_na() %>%
  mutate(fam_income_mid = (income_high+income_low)/2) %>%
  group_by(race, citizen) %>%
  summarise(N = n(),
            avg_income = mean(fam_income_mid)) %>%
  arrange(avg_income)
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 15474 rows [205,
## 211, 248, 292, 311, 377, 400, 431, 504, 535, 554, 555, 581, 596, 611, 616, 655,
## 662, 710, 766, ...].
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## `summarise()` has grouped output by 'race'. You can override using the `.groups` argument.
avg_income_by_race3
avg_income_plot3 <- ggplot(data = avg_income_by_race3, mapping = aes(x = race, y = avg_income)) +
  geom_col() +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Average U.S. Family Income by Citizenship Status Across Race", x = "Race", y = "Average Family Income") +
  theme(
    plot.title = element_text(hjust = 0.5)
  ) +
  facet_wrap(~ citizen) +
  geom_text(aes(label = signif(avg_income, digits = 3)), angle = 90, 
              nudge_y = -15000, color = 'white')
avg_income_plot3

Country of Origin The country of origin trend mirrors that of the one for citizenship. This makes sense, as respondents who reported that they were born outside of the U.S. most likely do not hold U.S. citizenship.

avg_income_by_race4 <- atuscps %>%
  separate(famincome, into=c('income_low','income_high'), sep='-',convert=TRUE)%>%
  mutate(income_low = as.integer(income_low)) %>%
  drop_na() %>%
  mutate(fam_income_mid = (income_high+income_low)/2) %>%
  group_by(race, country_born) %>%
  summarise(N = n(),
            avg_income = mean(fam_income_mid)) %>%
  arrange(avg_income)
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 15474 rows [205,
## 211, 248, 292, 311, 377, 400, 431, 504, 535, 554, 555, 581, 596, 611, 616, 655,
## 662, 710, 766, ...].
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## `summarise()` has grouped output by 'race'. You can override using the `.groups` argument.
avg_income_by_race4
avg_income_plot4 <- ggplot(data = avg_income_by_race4, mapping = aes(x = race, y = avg_income)) +
  geom_col() +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Average U.S. Family Income by Citizenship Status Across Race", x = "Race", y = "Average Family Income") +
  theme(
    plot.title = element_text(hjust = 0.5)
  ) +
  facet_wrap(~ country_born) +
  geom_text(aes(label = signif(avg_income, digits = 3)), angle = 90, 
              nudge_y = -15000, color = 'white')
avg_income_plot4

Statistical Analysis

First, simple linear regresions were conducted to analyze the individual demographic varaibles from the study. Then, they were combined to form a multiple regression to create a more complex and realistic relationship to income.

income2 <- atuscps %>%
  separate(famincome, into=c('income_low','income_high'), sep='-',convert=TRUE)%>%
  mutate(income_low = as.integer(income_low)) %>%
  drop_na() %>%
  mutate(fam_income_mid = (income_high+income_low)/2) 
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 15474 rows [205,
## 211, 248, 292, 311, 377, 400, 431, 504, 535, 554, 555, 581, 596, 611, 616, 655,
## 662, 710, 766, ...].
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
income2
mod1 <- lm(fam_income_mid ~ race, data = income2)
summary(mod1)
## 
## Call:
## lm(formula = fam_income_mid ~ race, data = income2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -61183 -27646 -10146  27564  85064 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     55145.11      96.74  570.06   <2e-16 ***
## raceBlack only -15210.02     255.65  -59.50   <2e-16 ***
## raceOther       -8671.35     578.68  -14.98   <2e-16 ***
## raceAsian only  12286.94     511.35   24.03   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33970 on 152038 degrees of freedom
## Multiple R-squared:  0.02868,    Adjusted R-squared:  0.02866 
## F-statistic:  1496 on 3 and 152038 DF,  p-value: < 2.2e-16
mod2 <- lm(fam_income_mid ~ citizen, data = income2)
summary(mod2)
## 
## Call:
## lm(formula = fam_income_mid ~ citizen, data = income2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -47881 -26631  -9131  33369  82514 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  42485.2      320.5  132.56   <2e-16 ***
## citizenyes   11644.9      333.3   34.94   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34330 on 152040 degrees of freedom
## Multiple R-squared:  0.007964,   Adjusted R-squared:  0.007957 
## F-statistic:  1221 on 1 and 152040 DF,  p-value: < 2.2e-16
mod3 <- lm(fam_income_mid ~ country_born, data = income2)
summary(mod3)
## 
## Call:
## lm(formula = fam_income_mid ~ country_born, data = income2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -47988 -26738  -9238  33262  77475 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        54237.79      95.48  568.06   <2e-16 ***
## country_bornnon-US -6712.89     249.07  -26.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34390 on 152040 degrees of freedom
## Multiple R-squared:  0.004755,   Adjusted R-squared:  0.004749 
## F-statistic: 726.4 on 1 and 152040 DF,  p-value: < 2.2e-16

To represent the more complex relationships between all of these demographic variables and income, I used multiple regression

mod4 <- lm(fam_income_mid ~ race + citizen + country_born,  data = income2)
summary(mod4)
## 
## Call:
## lm(formula = fam_income_mid ~ race + citizen + country_born, 
##     data = income2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69055 -27193  -8443  26557  99441 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         46051.0      462.2   99.63   <2e-16 ***
## raceBlack only     -15564.4      254.0  -61.28   <2e-16 ***
## raceOther           -8503.4      574.7  -14.80   <2e-16 ***
## raceAsian only      18797.7      535.9   35.08   <2e-16 ***
## citizenyes          10455.4      451.8   23.14   <2e-16 ***
## country_bornnon-US  -4927.7      348.4  -14.15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33730 on 152036 degrees of freedom
## Multiple R-squared:  0.04217,    Adjusted R-squared:  0.04214 
## F-statistic:  1339 on 5 and 152036 DF,  p-value: < 2.2e-16

Conclusion

Based on the results of the linear regressions conducted, all of the demographic variables studied are significant due to their small p-values (<2e-16). Unfortunately, this means that the impacts and importance of each variable cannot be distinguished since they returned the same p-values and are all concluded to be significant. In conclusion, race, citizenship, and country of origin are all significant predictors of family income.